/* OPTIONS OBS=5000  NOREPLACE ; */    


/** This programs computes the within-industry interquartile ranges of TFPR for each industry-year on:

	1.  2002 and 2007 Census Bureau-completed non-AR CMF plants
	2.  2002 and 2007 non-AR CMF plants for which no data needed for TFPR are imputed using the industry average ratio method
            or univariate regression using only current-year data (i.e., Beta model 1 with only current-year data).  
	    That is, TVS, TIB, TIE, CM, EE, CF, PH, or WW that are imputed one of these ways are replaced with missing values, 
            and thus dropped from the estimation sample.

  INPUT FILES:    

    nberces.naics5809:  NBER-CES Productivity Database (industry-level aggregates and price indices). 
    bea.bea_naics:       3- or 4-digit NAICS industry level capital data from BEA. 
 
    allcmf.cmf2002_nonar.sas7bdat

    allcmf.cmf07asm06.sas7bdat

    allcmf.gooddata_all_inds02.sas7bdat
    allcmf.gooddata_all_inds07.sas7bdat

    allcmf.imputes_all_inds02
    allcmf.imputes_all_inds07

  OUTPUT FILES:

    allcmf.allcmf0207
    allcmf.allcmf0207_tfpr_iqrs

    allcmf.gooddataalinds0207
    allcmf.gooddataalinds0207_tfpr_iqrs

**/

%include "ASMimplibs.sas";


/* Read in just the 2002 price indexes, so that we can rescale the original 
   indexes to a 2002 base year. */

data nberces2002 (keep = naics PISHIP_2002  PIMAT_2002 PIEN_2002 PIINV_2002);
 set nberces.naics5809;
    if year=2002;
    PISHIP_2002 = PISHIP;
    PIMAT_2002 = PIMAT;
    PIEN_2002 = PIEN;
    PIINV_2002 = PIINV;
run;

data nberces0207;
 set nberces.naics5809;
 if year in (2002,2007);
run;
 
proc sort data=nberces0207; by naics; run;
proc sort data=nberces2002; by naics; run;

data nberces0207 in0207_not_in02 in02_not_in07;
 merge nberces0207 (in=in0207) nberces2002 (in=in02);
 by naics;
 if in0207 and in02 then output nberces0207;
 else if in0207 then output in0207_not_in02;
 else output in02_not_in07;
run;

/* Read in the NBER-CES productivity database for industry prices indices, and to 
   construct industry cost shares and capital shares. */ 
/* Rescale the price indexes so that 2002=1.000 */

data nberces0207 (keep = naics_char year alpha_K alpha_L alpha_E alpha_M PISHIP_new PIMAT_new PIEN_new PIINV_new 
                          keqshare kstshare);
 set nberces0207;
  alpha_L = pay/vship;
  alpha_E = energy/vship;
  alpha_M = (matcost-energy)/vship;
  alpha_K = 1 - sum(alpha_L,alpha_E,alpha_M);  
  PISHIP_new = PISHIP/PISHIP_2002;
  PIMAT_new = PIMAT/PIMAT_2002;
  PIEN_new = PIEN/PIEN_2002;
  PIINV_new = PIINV/PIINV_2002;

  totcapital = equip + plant;
  keqshare = equip/totcapital;  /* Industry-level equipment capital's share */
  kstshare = plant/totcapital;  /* Industry-level structures capital's share */
  naics_char = put(NAICS,6.);

run;

proc datasets library = work nolist;
  modify nberces0207;
  rename naics_char= NAICS;
RUN;


/* Read in the 2002 and 2007 BEA capital stock dataset (for converting book values to market values of capital stocks).  
   NAICS_BEA are mostly 3-digit NAICS codes, but some are ranges of codes (e.g., 3361-3363).
   So I need to create a new "NAICS4_BEA" variable to be able to merge the BEA data to the CMF data. 
*/

data bea_capital (keep = NAICS4_BEA year gkheq gkhst nkceq nkcst);
 set bea.bea_naics;
 if year in (2002,2007);
 NAICS4 = substr(NAICS_BEA,1,4);
 NAICS7 = substr(NAICS_BEA,1,7);
 NAICS9 = substr(NAICS_BEA,1,9);
 if NAICS4 in ("321 " , "322 " , "323 " , "324 " , "325 " , "326 " , "327 " , "331 " , "332 " , "333 " , "334 " , "335 " , "337 " , "339 ") 
 then NAICS4_BEA = NAICS4;
 else if NAICS7 = "311-312" 
 then NAICS4_BEA = "311 ";
 else if NAICS7 = "313-314" 
 then NAICS4_BEA = "313 ";
 else if NAICS7 = "315-316" 
 then NAICS4_BEA = "315 ";
 else if NAICS9 = "3361-3363"
 then NAICS4_BEA = "3361";
 else if NAICS9 = "3364-3369"
 then NAICS4_BEA = "3364";
 if NAICS4_BEA ne .;
run;


%macro merge_industry_plant(dataset=);

	/* Create a new industry variable for merging to the BEA capital data. */

	data &dataset (keep = survu_id firmid NAICS_NEW_6 NAICS4_BEA tvs tib tie cm ee cf sw ph ww tae year);
	 set &dataset;
	   NAICS3 = substr(NAICS_NEW_6,1,3);
	   NAICS4 = substr(NAICS_NEW_6,1,4);
	   if 	   NAICS3 = "321" then NAICS4_BEA = "321 ";
	   else if NAICS3 = "322" then NAICS4_BEA = "322 "; 
	   else if NAICS3 = "323" then NAICS4_BEA = "323 "; 
	   else if NAICS3 = "324" then NAICS4_BEA = "324 "; 
	   else if NAICS3 = "325" then NAICS4_BEA = "325 "; 
	   else if NAICS3 = "326" then NAICS4_BEA = "326 "; 
	   else if NAICS3 = "327" then NAICS4_BEA = "327 "; 
	   else if NAICS3 = "331" then NAICS4_BEA = "331 "; 
	   else if NAICS3 = "332" then NAICS4_BEA = "332 "; 
	   else if NAICS3 = "333" then NAICS4_BEA = "333 "; 
	   else if NAICS3 = "334" then NAICS4_BEA = "334 "; 
	   else if NAICS3 = "335" then NAICS4_BEA = "335 "; 
	   else if NAICS3 = "337" then NAICS4_BEA = "337 "; 
	   else if NAICS3 = "339" then NAICS4_BEA = "339 "; 
	   else if NAICS3 in ("311","312")
	   then NAICS4_BEA = "311 ";
	   else if NAICS3 in ("313","314")
	   then NAICS4_BEA = "313 ";
	   else if NAICS3 in ("315","316")
	   then NAICS4_BEA = "315 ";
	   else if NAICS4 in ("3361","3362","3363")
	   then NAICS4_BEA = "3361";
	   else if NAICS4 in ("3364","3365","3366","3367","3368","3369")
	   then NAICS4_BEA = "3364";
	run;

	/* Rename the 6-digit NAICS code variable for merging with the deflators dataset. */
	proc datasets library=work nolist;
	  modify &dataset;
	 rename NAICS_NEW_6 = NAICS;
	run;

	proc sort data=&dataset; by NAICS YEAR; run;
	proc sort data=nberces0207; by NAICS YEAR; run;

	/* Merge NBER deflators and cost shares with the plant-level CMF data. */

	data &dataset in_cmf_not_in_nber in_nber_not_in_cmf;
	 merge  &dataset (in=incmf) nberces0207 (in=innber); 
	 by NAICS year;
	 if incmf and innber then output &dataset;
	 else if incmf then output in_cmf_not_in_nber;
	 else output in_nber_not_in_cmf;
	run;

	proc sort data=&dataset; by NAICS4_BEA year; run;
	proc sort data=bea_capital; by NAICS4_BEA year; run;

	/* Merge BEA capital data with plant-level CMF data. */
	/* NOTE: for non-matches, we are only concerned about
	   records in the CMF that don't match to a BEA industry. */

	data &dataset in_bea_not_in_cmf in_cmf_not_in_bea;
	 merge bea_capital (in=inbea) &dataset (in=incmf);
	 by NAICS4_BEA year;
	 if inbea and incmf then output &dataset;
	 else if inbea then output in_bea_not_in_cmf;
	 else output in_cmf_not_in_bea;
	run; 

	/*  Look at the plants in the CMF that did't merge to the BEA dataset. */
	proc freq data=IN_CMF_NOT_IN_BEA;
	 tables NAICS4_BEA*year NAICS*year ;
	title "Plants in CMF that did not match to the BEA dataset";
	run;

	/* Convert nominal variables to real;
	   Impute production-worker-equivalent hours; 
	   Convert book values of assets to real market values of capital stock;
	   Calculate TFPR;
	*/
	data &dataset;
	 set &dataset;
	   dinv = tie - tib;
	   gross_output = sum(tvs,dinv);
	   q = gross_output/piship_new;
	   if q > 0 then lq = log(q);

	   keqst = keqshare*tae*(nkceq/gkheq)/piinv_new;  * market value of stock of equipment;
	   kstst = kstshare*tae*(nkcst/gkhst)/piinv_new;  * market value of stock of structures;
	   kst = sum(keqst,kstst);
	   if kst > 0 then lk = log(kst);

	   if ww>0 then th = ph*sw/ww;
	   if th>0 then lth = log(th);

	   cost_energy = sum(ee,cf);
	   realenergy = cost_energy/pien_new;
	   if realenergy>0 then le = log(realenergy);

	   realcm = cm/pimat_new;
	   matq = realcm - realenergy;
	   if matq>0 then lm = log(matq);
   
	   tfpr = lq - (alpha_K*lk + alpha_L*lth + alpha_E*le + alpha_M*lm);
	run; 


	 proc sort data=&dataset; by year NAICS; RUN;

	 proc means data=&dataset qrange NOPRINT;
	  var tfpr;
	 by year NAICS;
	 output out=allcmf.&dataset._tfpr_iqrs (keep = year NAICS tfpr_iqr) qrange=tfpr_iqr ;
	 run;

	/* Save for disclosure analysis */
        data allcmf.&dataset (keep = firmid NAICS year tfpr tvs);
         set &dataset;
	run;

	data allcmf.&dataset._tfpr_iqrs;
	 set allcmf.&dataset._tfpr_iqrs;
	  tfpr7525 = exp(tfpr_iqr);
	run;


%mend;


/* Read in the plant-level 2002 and 2007 CMF data (all tabulated non-Administrative-Records plants). */

data allcmf0207; 
 set allcmf.cmf2002_nonar
     allcmf.cmf07asm06;
run;

%merge_industry_plant(dataset=allcmf0207);

/* Now read in the 2002 and 2007 plant-level CMF data in which ratio-imputed univariate-regression-imputed data
   have been replaced with missing values. */

data gooddataalinds0207 (drop=year);
 set allcmf.gooddata_all_inds02
     allcmf.gooddata_all_inds07;
 year_num = input(year,4.);
run;

proc datasets library=work nolist;
  modify gooddataalinds0207;
  rename year_num = year;
run;


%merge_industry_plant(dataset=gooddataalinds0207);



/**************/
/*  Table XX  */
/*************/

proc sort data = allcmf.allcmf0207_tfpr_iqrs; by year; run;

proc means data=allcmf.allcmf0207_tfpr_iqrs N q1 median mean stddev q3;
 var tfpr_iqr ;
by year;
title1 "Table XX: Distribution of NAIC6 industry IQRs of TFPR in Bureau-completed data";
run;

proc means data= allcmf.allcmf0207_tfpr_iqrs N q1 median mean stddev q3;
 var tfpr7525 ;
by year;
title1 "Table 3: Distribution of NAIC6 industry 75-25 ratios of TFPR in Bureau-completed data";
run;



proc sort data = allcmf.gooddataalinds0207_tfpr_iqrs; by year; run;

proc means data=allcmf.gooddataalinds0207_tfpr_iqrs N q1 median mean stddev q3;
 var tfpr_iqr ;
by year;
title1 "Table XX: Distribution of NAIC6 industry IQRs of TFPR in Complete-cases data";
run;


proc means data=allcmf.gooddataalinds0207_tfpr_iqrs N q1 median mean stddev q3;
 var tfpr7525 ;
by year;
title1 "Table 3: Distribution of NAIC6 industry 75-25 ratios of TFPR in Complete-cases data";
run;


